#######################################################################
#####                                                            ######
#####   Input: Debates data; dictionary words                    ######
#####   Output: Measure of debate topic                          ######
#####                                                            ######
#######################################################################

# Load libraries

library(quanteda) # v3.3.1
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.4
library(data.table) # v1.14.8
library(text2vec) # v0.6.3

## Source dictionaries

# Seed-word table: one column per policy topic.
keywords <- read.csv("data/dictionaries/seed_words.csv", stringsAsFactors = FALSE)

# Keep only the leading entries of each topic column. The per-topic counts are
# hard-coded; presumably they equal the number of filled-in rows in each
# column of the CSV -- TODO confirm against the file.
defence_words     <- keywords$Defence[seq_len(47)]
economy_words     <- keywords$FinanceEconomy[seq_len(50)]
agriculture_words <- keywords$Agriculture[seq_len(39)]
health_words      <- keywords$Health[seq_len(46)]
children_words    <- keywords$ChildrenFamily[seq_len(34)]
education_words   <- keywords$Education[seq_len(48)]
social_words      <- keywords$SocialWelfare[seq_len(53)]
trade_words       <- keywords$ForeignTrade[seq_len(46)]
environment_words <- keywords$Environment[seq_len(50)]
crime_words       <- keywords$CrimePolicing[seq_len(36)]
transport_words   <- keywords$Transportation[seq_len(34)]

# One dictionary key per topic; the key names later become the `dic_*`
# score columns.
debate_dictionary_to_use <- dictionary(
  x = list(
    defence     = defence_words,
    economy     = economy_words,
    agriculture = agriculture_words,
    health      = health_words,
    children    = children_words,
    education   = education_words,
    social      = social_words,
    trade       = trade_words,
    environment = environment_words,
    crime       = crime_words,
    transport   = transport_words
  )
)

# Persist the dictionary so it can be re-loaded without the CSV.
save(list = "debate_dictionary_to_use", file = "working/debate_dictionaries.Rdata")

## Load data

# Restore the debates table and the dictionary object written out above.
load(file = "data/debates.Rdata")
load(file = "working/debate_dictionaries.Rdata")

# Drop pre-Blair years: keep every row outside the 1992-1997 term.
# `debates` is subset with data.table syntax, as in the scoring loop below.
debates <- debates[parliamentary_term != "1992-1997"]

## Loop over year-months

# Score the debates one `yearmon` period at a time and collect one result
# table per period (presumably to keep each document-feature matrix small --
# TODO confirm).
periods <- unique(debates$yearmon)
year_out_list <- vector("list", length(periods))

for (i in seq_along(periods)) {

  y <- periods[i]
  print(y)

  debates_test <- debates[yearmon == y]

  # Turn to corpus; the text to score is held in the `parent` column
  debates_test_corpus <- corpus(debates_test, text_field = "parent")

  # Tokenise explicitly before building the document-feature matrix:
  # calling dfm() directly on a corpus is deprecated since quanteda v3.
  debates_tokens <- tokens(debates_test_corpus)
  debates_dfm <- dfm(debates_tokens)

  ## Calculate dictionary counts

  # dfm_lookup() replaces the deprecated `dfm(x, dictionary = )` form and
  # returns one count column per dictionary key.
  # NOTE(review): a lookup on a dfm can only match single-token entries; if
  # the seed words contain multi-word phrases, use tokens_lookup() on
  # `debates_tokens` instead -- confirm against the seed-word CSV.
  dfm_liwc <- dfm_lookup(debates_dfm, dictionary = debate_dictionary_to_use)

  dict_scores <- as.data.frame(as.matrix(dfm_liwc))
  names(dict_scores) <- paste0("dic_", names(dict_scores))

  ## Merge together

  # Identifiers come from the document variables; as.character() on the
  # corpus replaces the deprecated texts() accessor.
  year_out <- data.table(epobject_id = docvars(debates_dfm)$epobject_id,
                         section_id = docvars(debates_dfm)$section_id,
                         person_id = docvars(debates_dfm)$person_id,
                         adjusted_sent = as.character(debates_test_corpus),
                         dict_scores)

  year_out_list[[i]] <- year_out

}

# Stack the per-period tables. rbindlist() ships with data.table (already
# attached), whereas the previously used rbind.fill() lives in plyr, which
# this script never loads.
debate_title_scores <- rbindlist(year_out_list, fill = TRUE)

save(debate_title_scores, file = "working/debate_title_scores.Rdata")

## Top debate titles

# Return the `n` highest-scoring distinct debate titles for one score column.
#
# scores    : table with an `adjusted_sent` text column and `dic_*` counts
# score_col : name of the count column to rank by
# n         : number of titles to return
#
# The result is always a character vector of length `n`, padded with NA when
# fewer than `n` distinct titles exist. If `score_col` is absent, a warning is
# raised and an all-NA vector is returned so that the objects saved below keep
# a consistent shape.
top_titles <- function(scores, score_col, n = 20L) {
  if (!score_col %in% names(scores)) {
    warning("No column '", score_col, "' in the debate scores; ",
            "returning NA titles", call. = FALSE)
    return(rep(NA_character_, n))
  }
  ranking <- order(scores[[score_col]], decreasing = TRUE)
  unique(scores$adjusted_sent[ranking])[seq_len(n)]
}

top_defence     <- top_titles(debate_title_scores, "dic_defence")
top_economy     <- top_titles(debate_title_scores, "dic_economy")
top_welfare     <- top_titles(debate_title_scores, "dic_social")
top_agriculture <- top_titles(debate_title_scores, "dic_agriculture")
top_health      <- top_titles(debate_title_scores, "dic_health")
top_children    <- top_titles(debate_title_scores, "dic_children")
top_education   <- top_titles(debate_title_scores, "dic_education")
top_trade       <- top_titles(debate_title_scores, "dic_trade")
top_environment <- top_titles(debate_title_scores, "dic_environment")
# NOTE(review): the dictionary built earlier in this script has no `energy`
# key, so `dic_energy` is not produced by the scoring step and this currently
# yields NA titles (with a warning). Either add an energy seed-word list to
# the dictionary or drop `top_energy`.
top_energy      <- top_titles(debate_title_scores, "dic_energy")
top_crime       <- top_titles(debate_title_scores, "dic_crime")
top_transport   <- top_titles(debate_title_scores, "dic_transport")

save(top_defence, top_economy, top_agriculture, top_health, top_children,
     top_education, top_welfare, top_trade, top_environment, top_energy,
     top_crime, top_transport, file = "working/validation/top_debate_titles.Rdata")
